﻿<?php
	// NOTE(review): judging by the file name this presumably provides database connection settings — confirm what it defines.
	include_once '../php_spider/db_connection_config.php';
    set_time_limit(0); // 0 means no execution time limit
?>
<?php
    /**
     * Fetch one fixed table-of-contents page from www.qnwz.cn and echo the
     * fields extracted from it, each followed by "<br />".
     *
     * Output order: every first-level classification, then every second-level
     * classification, then title / author / page for each entry found by the
     * second-pass pattern. If the page cannot be fetched or converted to
     * UTF-8, an error banner is echoed and nothing else is emitted.
     *
     * NOTE(review): text captured from the remote page is echoed without HTML
     * escaping.
     *
     * @return void
     */
    function catch_contents()
    {
        // None of the patterns carry the "u" modifier, so "." and the {n}
        // quantifiers count bytes rather than characters.
        $regdata_1 = "/<font size=\"3\">((?<first_classification>[^<]*)<br \/>){0,1}⊙(?<second_classification>.{12})\S*\s/";
        $regdata_2 = "/<font size=\"3\">([^<]*<br \/>)(?<left>([^<]*<\/?(br \/|font)>)*)<\/div>/";
        // Second-pass pattern, applied to every "left" capture of $regdata_2.
        // It does not depend on the loop below, so it is built once here.
        $regdata_3 = "/⊙.{13}(?<title>\S*)\s(?<author>\D*)(?<page>\d*)/";

        $printstr = "<hr />出错：【错】无法打开《青年文摘》页面<hr />";

        // Fetch the page and convert it from GBK to UTF-8 before matching.
        // Both calls return false on failure, so each result is checked
        // explicitly instead of relying on a loose comparison with ''.
        $raw = file_get_contents('http://www.qnwz.cn/html/daodu/201107/282277.html');
        $html = ($raw === false) ? false : iconv("GBK", "UTF-8", $raw);
        if ($html === false || $html === '')
        {
            // This branch used to end with `continue`, which is a fatal error
            // outside a loop, and the message was never printed. Report the
            // failure and stop instead.
            echo $printstr;
            return;
        }

        // Match the page against both first-pass patterns.
        preg_match_all($regdata_1, $html, $mdata_1);
        preg_match_all($regdata_2, $html, $mdata_2);

        // Emit all first-level classifications, then all second-level ones.
        foreach ($mdata_1['first_classification'] as $first_classification)
        {
            echo trim($first_classification).'<br />';
        }
        foreach ($mdata_1['second_classification'] as $second_classification)
        {
            echo trim($second_classification).'<br />';
        }

        // Second pass: pull title / author / page out of each "left" capture.
        foreach ($mdata_2['left'] as $left)
        {
            // A str_replace() call used to sit here with its result discarded,
            // so the pattern has always run against the unmodified capture.
            // That no-op call is dropped.
            preg_match_all($regdata_3, trim($left), $mdata_3);

            foreach (array_keys($mdata_3[0]) as $i)
            {
                echo trim($mdata_3['title'][$i]).'<br />';
                echo trim($mdata_3['author'][$i]).'<br />';
                echo trim($mdata_3['page'][$i]).'<br />';
            }
        }
    }
?>